The data has been collected from different sources for analysis are listed below.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sbn
import plotly_express as px
import us
import plotly.offline as py
import plotly.graph_objects as go
import us
import plotly.io as pio
py.init_notebook_mode()
This has been collected from zillow. It containes the ZHVI($, middle tier houses) with region name from year 1996 to 2020.
Zillow Home Value Index (ZHVI): A smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range.
data = pd.read_csv("Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_raw_mon.csv")
data.head()
# extract city from region name
data.drop(0,axis=0,inplace=True)
data['city'] = data.RegionName.apply(lambda x: x.split(',')[0])
dates = data.columns.tolist()[5:-1]
df = data[['StateName']+dates].melt(id_vars=["StateName"],
var_name="date",
value_name="price")
df.rename(columns={'StateName':'state'},inplace=True)
df.head()
df.price.fillna(0,inplace=True)
df.date = pd.to_datetime(df.date)
df['year'] = df.date.dt.year
df.isna().sum()
px.line(df[['year','price']].groupby('year').mean().reset_index(),
x='year',
y='price',
title='Average house price from year 1996 to 2020')
states = df[['state','date','price']].groupby(['state']).mean().reset_index().sort_values(by='price',ascending=False)[:5]['state'].values.tolist()
px.line(df[df['state'].isin(states)][['state','date','price']].groupby(['state','date']).median().reset_index(),
x='date',
y='price',
color='state',
title='Top 5 expensive state\'s House value index($) from year 1998 to 2020 by state')
Here let's take some key factore which can imapact the housing prices in usa. One of the key factore is economy which is generally measured by GDP, employment rate etc.
Let's first get the data for GDP. This data is collected from https://apps.bea.gov/. which containes the percentage of change in gdp from preceding year. The time period of the data is from year 1997 to 2019
gdp = pd.read_excel('gdp_by_state.xls',skiprows=5,skipfooter=3)
gdp.drop(0,axis=0,inplace=True)
gdp.drop('GeoFips',axis=1,inplace=True)
gdp.head()
year = gdp.columns[2:]
df_gdp = gdp.melt(id_vars=["GeoName"],
var_name="year",
value_name="gdp")
df_gdp.year = df_gdp.year.apply(lambda x: int(x.split('-')[1]))
df_gdp.GeoName = df_gdp.GeoName.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else x)
df_gdp.rename(columns ={'GeoName':'state'},inplace=True)
df_gdp.head()
states = df_gdp[['state','year','gdp']].groupby(['state']).mean().reset_index().sort_values(by='gdp',ascending=False)[:5]['state'].values.tolist()
px.line(df_gdp[df_gdp['state'].isin(states)][['state','year','gdp']].groupby(['state','year']).median().reset_index(),
x='year',
y='gdp',
color='state',
title='Top 5 states with highest percentage change in gdp from year 1998 to 2019 by state')
df = df.merge(df_gdp,left_on=['state','year'], right_on=['state','year'])
As mentioned earlyer the another key factore of economy is employment rate. This data containes the state wise employment percentage of the total US employement. This data is also from year 1997 to 2019.
emp = pd.read_excel('Employment_by_state.xls',skiprows=5,skipfooter=5)
col = ['GeoName','1997','1998','1999','2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']
emp.drop(0,inplace=True)
emp = emp[col]
emp.head()
emp = emp.melt(id_vars=["GeoName"],
var_name="year",
value_name="emp_perc")
emp.rename(columns={'GeoName':'state'},inplace=True)
emp.state = emp.state.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else x)
emp.year = emp.year.astype('int')
emp.emp_perc = emp.emp_perc.astype('float')
states = emp[['state','year','emp_perc']].groupby(['state']).mean().reset_index().sort_values(by='emp_perc',ascending=False)[:5]['state'].values.tolist()
px.line(emp[emp['state'].isin(states)][['state','year','emp_perc']].groupby(['state','year']).median().reset_index(),
x='year',
y='emp_perc',
color='state',
title='Top 5 states with highest percentage of employement of total employment from year 1998 to 2019 by state')
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
tmp = emp[emp['year']>2010].groupby('state')['emp_perc'].mean().reset_index()
tmp = tmp.dropna(subset=['state','emp_perc'], how='any')
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = tmp['state'],
z = tmp['emp_perc'].astype(float),
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 2
) ),
colorbar = dict(
title = "Employment percentage")
) ]
layout = dict(
title = 'Average Employment percentage in different states after 2010',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = dict( data=data, layout=layout )
py.iplot(fig)
df = df.merge(emp,left_on=['state','year'], right_on=['state','year'])
Interest rate also has major impact on house prices, as most people consider buying house with a morgage. Changes in interest rates can greatly influence a person's ability to purchase a residential property. This data is from year 1971 to 2020. It containes 30 year fixed morgage interest rate average.
interest = pd.read_csv("MORTGAGE30US_intrest_rate.csv")
interest.DATE = pd.to_datetime(interest.DATE)
interest.head()
interest = interest.groupby(interest.DATE.dt.year).mean().reset_index()
interest.columns = ['year','interest_rate']
px.line(interest,x='year',y='interest_rate',title='Morgage interest rate in percentage from year 1971 to 2020')
df = df.merge(interest,left_on='year',right_on='year')
Demographic of the state and contry can also impact the house price. Here population, personal income are considered for analysis. The population data containes the state wise population percentage of total population
population = pd.read_excel('population_by_state.xls',skiprows=5,skipfooter=6)
population = population[col]
population.drop(0,axis=0,inplace=True)
population.head()
population = population.melt(id_vars=["GeoName"],
var_name="year",
value_name="population_perc")
population.rename(columns={'GeoName':'state'},inplace=True)
population.state = population.state.apply(lambda x:x.split(" ")[0])
population.year = population.year.astype('int')
population.population_perc = population.population_perc.astype('float')
population.state = population.state.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else x)
population.head()
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
tmp = population.groupby('state')['population_perc'].mean().reset_index()
tmp = tmp.dropna(subset=['state','population_perc'], how='any')
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = tmp['state'],
z = tmp['population_perc'].astype(float),
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 2
) ),
colorbar = dict(
title = "Population percentage")
) ]
layout = dict(
title = 'Average population percentage in different states',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = dict( data=data, layout=layout )
py.iplot(fig)
df = df.merge(population,left_on=['state','year'], right_on=['state','year'])
df.gdp = df.gdp.astype('float')
df['month'] = df.date.dt.month
This data is used to how each state's house prices changes with time.
House Price Index(HPI): A house price index measures the price changes of residential housing as a percentage change from some specific start date.
hpi = pd.read_csv('House_price_by _state.csv')
col = ['Date','AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV',
'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
'VT', 'WA', 'WI', 'WV', 'WY']
hpi = hpi.melt(id_vars=["Date"],
var_name="state",
value_name="hpi")
hpi.Date = pd.to_datetime(hpi.Date)
hpi.hpi = pd.to_numeric(hpi.hpi)
hpi['year'] = hpi.Date.dt.year
hpi['month'] = hpi.Date.dt.month
hpi.drop(columns='Date',inplace=True)
hpi.head()
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
tmp = hpi.groupby('state')['hpi'].mean().reset_index()
tmp = tmp.dropna(subset=['state','hpi'], how='any')
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = tmp['state'],
z = tmp['hpi'].astype(float),
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 2
) ),
colorbar = dict(
title = "House price index")
) ]
layout = dict(
title = 'Average hpi in different states',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = dict( data=data, layout=layout )
py.iplot(fig)
df = df.merge(hpi, left_on=['state','year','month'], right_on=['state','year','month'])
The average personal income of us can also imapact the buyer's capability of buying the house. This data containes total monthly US personal income from year 1959 to 2020 in billion dollar.
income = pd.read_excel('personel income.xls',skiprows=10)
income.observation_date = pd.to_datetime(income.observation_date)
income['year'] = income.observation_date.dt.year
income['month'] = income.observation_date.dt.month
income.drop('observation_date',axis=1,inplace=True)
income.head()
px.line(income.groupby('year').mean().reset_index(),x='year',y='PI', title='Average of US personal income in billion dollar from year 1959 to 2020')
df = df.merge(income,left_on=['year','month'],right_on=['year','month'])
df.head()
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
tmp = df.groupby('state')['gdp'].mean().reset_index()
tmp = tmp.dropna(subset=['state','gdp'], how='any')
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = tmp['state'],
z = tmp['gdp'].astype(float),
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 2
) ),
colorbar = dict(
title = "GDP")
) ]
layout = dict(
title = 'Average percent of change of GDP in different states',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = dict( data=data, layout=layout )
py.iplot(fig)
px.line(df.drop_duplicates(subset=['month','year']),
x='date',
y='hpi',
title='House price index from year 1998 to 2020')
px.line(df,
x='year',
y='population_perc',
color='state',
title='Population percentage from year 1998 to 2020')
px.line(df,
x='year',
y='emp_perc',
color='state',
title='Employment percentage of total employment from year 1998 to 2020 by state',
labels={'emp_perc':'Employment percentage','year':'Year'})
corr = df.corr()
plt.figure(figsize=(10,10))
sbn.heatmap(corr,annot=True,cmap='coolwarm')
plt.title('Feature correlation')
plt.show()
From above heatmap we can conclude
state_data = pd.read_csv('State_time_series.csv')
state_data.drop(columns=['DaysOnZillow_AllHomes',
'InventorySeasonallyAdjusted_AllHomes',
'InventoryRaw_AllHomes', 'MedianPriceCutDollar_AllHomes',
'MedianPriceCutDollar_CondoCoop',
'MedianPriceCutDollar_SingleFamilyResidence','PctOfListingsWithPriceReductionsSeasAdj_AllHomes',
'PctOfListingsWithPriceReductionsSeasAdj_CondoCoop',
'PctOfListingsWithPriceReductionsSeasAdj_SingleFamilyResidence','Sale_Counts_Seas_Adj'],inplace=True)
state_data.Date = pd.to_datetime(state_data.Date)
state_data['year'] = state_data.Date.dt.year
Zillow Home Value Index (ZHVI): A smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range
var = ['ZHVI_1bedroom',
'ZHVI_2bedroom',
'ZHVI_3bedroom',
'ZHVI_4bedroom',
'ZHVI_5BedroomOrMore',]
tmp = state_data.groupby('year')[var].mean().reset_index()
fig = go.Figure()
for i in var:
fig.add_trace(go.Bar(x =tmp['year'], y=tmp[i], name=i.split('ZHVI_')[1]))
fig.update_layout(title='House value index for different year by no of rooms', xaxis_title='year',yaxis_title='ZHIV value')
fig.show()
tmp = state_data.groupby('RegionName')['ZHVIPerSqft_AllHomes'].mean().reset_index(name = "ZHVIpersqft_mean")
tmp.RegionName = tmp.RegionName.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else None )
tmp = tmp.dropna(subset=['ZHVIpersqft_mean','RegionName'], how='any')
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = tmp['RegionName'],
z = tmp['ZHVIpersqft_mean'].astype(float),
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 2
) ),
colorbar = dict(
title = "Home value per square foot")
) ]
layout = dict(
title = 'Average house value per square foot in different states',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = dict( data=data, layout=layout )
py.iplot(fig)
px.bar(state_data.groupby('year')['ZHVIPerSqft_AllHomes'].mean().reset_index(),
x='year',
y='ZHVIPerSqft_AllHomes',
title='Average house value per square foot in different year',
labels={'ZHVIPerSqft_AllHomes':'House value index per sqft'})
fig = go.Figure()
var = ['PctOfHomesIncreasingInValues_AllHomes','PctOfHomesDecreasingInValues_AllHomes']
tmp = state_data.groupby('year')[var].median().dropna().reset_index()
for i in var:
fig.add_trace(go.Scatter(x=tmp['year'], y=tmp[i],
mode='lines',
name=i))
fig.update_layout(title ='Percentage of house price increasing and decreasing in values', xaxis_title='year',yaxis_title='Percentage of house price')
fig.show()
fig = go.Figure()
var = ['MedianRentalPrice_1Bedroom',
'MedianRentalPrice_2Bedroom',
'MedianRentalPrice_3Bedroom',
'MedianRentalPrice_4Bedroom',
'MedianRentalPrice_5BedroomOrMore',
'MedianRentalPrice_CondoCoop',
'MedianRentalPrice_DuplexTriplex',
'MedianRentalPrice_MultiFamilyResidence5PlusUnits',
'MedianRentalPrice_SingleFamilyResidence']
tmp = state_data.groupby('year')[var].mean().dropna().reset_index()
for i in var:
fig.add_trace(go.Scatter(x=tmp['year'], y=tmp[i],
mode='lines',
name=i.split('MedianRentalPrice_')[1]))
fig.update_layout(title ='Median rental price per sqft', xaxis_title='year',yaxis_title='Price per sqft')
fig.show()
Key variables that will inpact the house prices are
Some other variable which impacts the house price will be